library(tidyverse)
library(dplyr)
library(gridExtra)
library(kableExtra)
library(ggthemr)
library(leaflet)
library(leaflet.extras)
library(wordcloud)
library(tm)
library(NLP)
library(corrplot)
# Set the ggthemr "dust" theme for the plots drawn below.
ggthemr("dust")
# Load the listings; empty strings are read as NA. FALSE is spelled out
# because `F` is an ordinary variable that can be reassigned.
airbnb <- read.csv(
  "AB_NYC_2019.csv",
  stringsAsFactors = FALSE,
  na.strings = ""
)
# Preview the first rows as a styled table inside a 910px-wide scroll box.
head(airbnb) %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "910px")
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 3647 | THE VILLAGE OF HARLEM….NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NA | NA | 1 | 365 |
| 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
| 5099 | Large Cozy 1 BR Apartment In Midtown East | 7322 | Chris | Manhattan | Murray Hill | 40.74767 | -73.97500 | Entire home/apt | 200 | 3 | 74 | 2019-06-22 | 0.59 | 1 | 129 |
# Show the type and the first few values of every column.
str(airbnb)
## 'data.frame': 48895 obs. of 16 variables:
## $ id : int 2539 2595 3647 3831 5022 5099 5121 5178 5203 5238 ...
## $ name : chr "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
## $ host_id : int 2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
## $ host_name : chr "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ neighbourhood_group : chr "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ neighbourhood : chr "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num -74 -74 -73.9 -74 -73.9 ...
## $ room_type : chr "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : int 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : int 1 1 3 1 10 3 45 2 2 1 ...
## $ number_of_reviews : int 9 45 0 270 9 74 49 430 118 160 ...
## $ last_review : chr "2018-10-19" "2019-05-21" NA "2019-07-05" ...
## $ reviews_per_month : num 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ calculated_host_listings_count: int 6 2 1 1 1 1 1 1 1 4 ...
## $ availability_365 : int 365 355 365 194 0 129 0 220 0 188 ...
# Per-column summary statistics, including NA counts for numeric columns.
summary(airbnb)
## id name host_id host_name
## Min. : 2539 Length:48895 Min. : 2438 Length:48895
## 1st Qu.: 9471945 Class :character 1st Qu.: 7822033 Class :character
## Median :19677284 Mode :character Median : 30793816 Mode :character
## Mean :19017143 Mean : 67620011
## 3rd Qu.:29152178 3rd Qu.:107434423
## Max. :36487245 Max. :274321313
##
## neighbourhood_group neighbourhood latitude longitude
## Length:48895 Length:48895 Min. :40.50 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## room_type price minimum_nights number_of_reviews
## Length:48895 Min. : 0.0 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.: 69.0 1st Qu.: 1.00 1st Qu.: 1.00
## Mode :character Median : 106.0 Median : 3.00 Median : 5.00
## Mean : 152.7 Mean : 7.03 Mean : 23.27
## 3rd Qu.: 175.0 3rd Qu.: 5.00 3rd Qu.: 24.00
## Max. :10000.0 Max. :1250.00 Max. :629.00
##
## last_review reviews_per_month calculated_host_listings_count
## Length:48895 Min. : 0.010 Min. : 1.000
## Class :character 1st Qu.: 0.190 1st Qu.: 1.000
## Mode :character Median : 0.720 Median : 1.000
## Mean : 1.373 Mean : 7.144
## 3rd Qu.: 2.020 3rd Qu.: 2.000
## Max. :58.500 Max. :327.000
## NA's :10052
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 45.0
## Mean :112.8
## 3rd Qu.:227.0
## Max. :365.0
##
# Remove the listing id, host id and host name columns in one assignment.
airbnb[c("id", "host_id", "host_name")] <- NULL
# Count missing values per column and plot the share of rows affected.
# which(..., arr.ind = TRUE) lists the (row, col) position of every NA cell;
# tabulating its second column gives the number of NAs per column index.
missing_airbnb <- as.data.frame(
  table(which(is.na(airbnb), arr.ind = TRUE)[, 2])
)
missing_airbnb$perc <- missing_airbnb$Freq / nrow(airbnb)
# Translate every column index into its column name in one vectorised step,
# so the labels stay correct however many columns contain NAs.
missing_airbnb$Var1 <- colnames(airbnb)[
  as.integer(as.character(missing_airbnb$Var1))
]
# Map bare column names in aes() so the mapping is resolved against `data`.
ggplot(missing_airbnb, aes(x = Var1, y = perc)) +
  geom_bar(stat = "identity", aes(color = I("white")), size = 0.3) +
  coord_flip() +
  ylim(0, 1) +
  ylab("Percentage missing") +
  xlab("Column name") +
  ggtitle(
    "Missing Data",
    subtitle = "name has less than 0.001 percentage missing"
  )
# Keep the listing names as a standalone vector, then remove `name` and the
# two review columns from the data frame.
airbnb_name <- airbnb[["name"]]
airbnb[c("name", "last_review", "reviews_per_month")] <- NULL
Both “reviews_per_month” and “last_review” are missing in about 20.56% of rows, which suggests that the two variables are missing for the same listings. With 20.56% of the values lost, we cannot use these two columns, so we drop them. Although “name” is missing in less than 0.1% of rows (a proportion below 0.001), we cannot fill those values in, so we set the names aside in `airbnb_name` and drop the column from the data frame.
Location and price are the most important factors for people choosing a homestay.
In the data, `neighbourhood_group` gives the location and `neighbourhood` gives the area within it.
# neighbourhood_group
# Treat the neighbourhood group as categorical and count listings per level.
airbnb[["neighbourhood_group"]] <- factor(airbnb[["neighbourhood_group"]])
summary(airbnb[["neighbourhood_group"]])
## Bronx Brooklyn Manhattan Queens Staten Island
## 1091 20104 21661 5666 373
# Number of listings per neighbourhood group, drawn as a bar chart.
NGlocation <- as.data.frame(table(airbnb$neighbourhood_group))
colnames(NGlocation) <- c("Location", "Freq")
# Map bare column names in aes() so the mapping is resolved against `data`
# rather than against the global NGlocation object.
ggplot(NGlocation, aes(x = Location, y = Freq, fill = Location)) +
  geom_bar(stat = "identity", aes(color = I("white")), size = 0.3) +
  theme(legend.position = "none") +
  xlab("Location") +
  ylab("Number of Houses")
As we can see, Brooklyn and Manhattan are the two most popular places for hosts to rent out their houses.
# Treat the neighbourhood as categorical and show the first 20 entries of its
# summary (the echoed output lists them in decreasing order of count).
airbnb[["neighbourhood"]] <- factor(airbnb[["neighbourhood"]])
head(summary(airbnb[["neighbourhood"]]), n = 20)
## Williamsburg Bedford-Stuyvesant Harlem Bushwick
## 3920 3714 2658 2465
## Upper West Side Hell's Kitchen East Village Upper East Side
## 1971 1958 1853 1798
## Crown Heights Midtown East Harlem Greenpoint
## 1564 1545 1117 1115
## Chelsea Lower East Side Astoria Washington Heights
## 1113 911 900 899
## West Village Financial District Flatbush Clinton Hill
## 768 744 621 572
# There are too many neighbourhoods to plot them all, so keep only the ones
# with more than 1,000 listings, where hosts most often rent out their houses.
NArea <- as.data.frame(table(airbnb$neighbourhood))
colnames(NArea) <- c("Area", "Freq")
NArea <- filter(NArea, Freq > 1000)
NArea <- arrange(NArea, Freq)
# Area is a factor, so sorting the rows alone does not change the bar order;
# reorder its levels by Freq so the flipped bars are sorted by listing count.
ggplot(NArea, aes(x = reorder(Area, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  xlab("Area") +
  ylab("Number of Houses") +
  coord_flip()
# Interactive heat map of listing coordinates with a base-map switcher.
# NOTE(review): addTiles() and the HikeBike layer are added without a `group`,
# so no layer is registered under the "OSM (default)" entry listed in
# baseGroups below; confirm this is intended.
nycLatLong <- data.frame(
  lat = airbnb[["latitude"]],
  lng = airbnb[["longitude"]]
)
leaflet(nycLatLong) %>%
  addTiles() %>%
  addProviderTiles("HikeBike.HikeBike") %>%
  addProviderTiles(providers$Stamen.Toner, group = "Toner") %>%
  addProviderTiles(providers$CartoDB.DarkMatter, group = "DarkMatter") %>%
  addProviderTiles(providers$Esri.WorldImagery, group = "ESRI-Sat") %>%
  addWebGLHeatmap(
    lng = airbnb[["longitude"]],
    lat = airbnb[["latitude"]],
    size = 500
  ) %>%
  addLayersControl(
    baseGroups = c("OSM (default)", "Toner", "DarkMatter", "ESRI-Sat"),
    options = layersControlOptions(collapsed = FALSE)
  )
# Frequency table of the distinct prices (not used by the plot below).
AirPrice <- as.data.frame(table(airbnb$price))
colnames(AirPrice) <- c("Price", "Freq")
# Density of the raw prices. Map the bare column name in aes() so the mapping
# is resolved against `data` rather than against the global object.
ggplot(airbnb, aes(x = price)) +
  geom_density() +
  ggtitle(
    "Distribution of price",
    subtitle = "The distribution is very skewed"
  ) +
  xlab("Price")
The distribution is heavily skewed: the graph shows that most prices are under $2,500, so we switch the x-axis to a log10 scale.
# Frequency table of the distinct prices (not used by the plot below).
AirPrice <- as.data.frame(table(airbnb$price))
colnames(AirPrice) <- c("Price", "Freq")
# Density of price on a log10 x-axis with a vertical line at the mean price.
# The subtitle is computed from the data so it always agrees with the line
# instead of quoting a hard-coded figure.
ggplot(airbnb, aes(x = price)) +
  geom_density() +
  ggtitle(
    "Distribution of price",
    subtitle = sprintf("Mean price = $%.2f", mean(airbnb$price))
  ) +
  xlab("Price") +
  scale_x_log10() +
  geom_vline(xintercept = round(mean(airbnb$price), 2), size = 1)
# Mean price per neighbourhood group, rounded to two decimals.
airbnb_nh <- summarise(
  group_by(airbnb, neighbourhood_group),
  price = round(mean(price), 2)
)
# One price density per neighbourhood group on a log10 x-axis. Each panel
# gets a vertical line at its group mean and a text label placed 1400 to the
# right of that mean at y = 1.5.
ggplot(airbnb, aes(price)) +
  geom_density() +
  geom_vline(data = airbnb_nh, aes(xintercept = price), size = 1) +
  geom_text(
    data = airbnb_nh,
    y = 1.5,
    aes(x = price + 1400, label = paste("Mean = ", price)),
    color = "darkgreen",
    size = 4
  ) +
  scale_x_log10() +
  facet_wrap(~neighbourhood_group) +
  ggtitle(
    "Transformed distribution of price\n by neighbourhood groups",
    subtitle = expression("With" ~'log'[10] ~ "transformation of x-axis")
  )
The five locations have similar distributions, but Manhattan has the highest prices of the five.
# Word cloud of the listing names saved before the `name` column was dropped.
wordcloud(airbnb_name)
In the names of their listings, hosts like to use words such as “private”, “manhattan”, “brooklyn”, “charming”, etc.
# Make room_type categorical, then build a two-column table holding the
# number of listings for each room type.
airbnb[["room_type"]] <- factor(airbnb[["room_type"]])
RoomType <- as.data.frame(table(airbnb[["room_type"]]))
names(RoomType) <- c("RoomType", "Freq")
str(RoomType)
## 'data.frame': 3 obs. of 2 variables:
## $ RoomType: Factor w/ 3 levels "Entire home/apt",..: 1 2 3
## $ Freq : int 25409 22326 1160
# Donut chart of listings per room type: a single stacked bar at x = 2 is
# wrapped into polar coordinates, and the x-limits starting at 0.5 leave the
# hole in the middle.
ggplot(RoomType, aes(x = 2, y = Freq, fill = RoomType)) +
  geom_bar(stat = "identity", color = "white", width = 0.8) +
  coord_polar(theta = "y") +
  xlim(0.5, 2.5) +
  scale_fill_brewer(palette = 4) +
  theme_void()
Most hosts rent out an “Entire home/apt” or a “Private room”.
# Box plot of the number of reviews per room type on a log10 y-axis, with a
# dashed purple line at the overall mean number of reviews.
# Map bare column names in aes() so the mapping is resolved against `data`
# rather than against the global object.
# NOTE(review): listings with 0 reviews have no finite log10 value, so they
# presumably do not appear on this axis; confirm that is acceptable.
ggplot(
  airbnb,
  aes(x = room_type, y = number_of_reviews, fill = room_type)
) +
  geom_boxplot() +
  geom_hline(
    yintercept = mean(airbnb$number_of_reviews),
    color = "purple",
    linetype = 2
  ) +
  scale_y_log10() +
  theme(legend.position = "none") +
  xlab("Room Type") +
  ylab("Number of Reviews")
Each room type has a similar number of reviews.
# Keep only the columns used in the correlation analysis and make sure the
# neighbourhood group is stored as a factor.
airMutiReg <- airbnb[c(
  "number_of_reviews",
  "neighbourhood_group",
  "room_type",
  "price",
  "minimum_nights",
  "availability_365"
)]
airMutiReg[["neighbourhood_group"]] <- factor(
  airMutiReg[["neighbourhood_group"]]
)
str(airMutiReg)
## 'data.frame': 48895 obs. of 6 variables:
## $ number_of_reviews : int 9 45 0 270 9 74 49 430 118 160 ...
## $ neighbourhood_group: Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 3 2 3 3 2 3 3 3 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 2 1 2 1 1 1 2 2 2 1 ...
## $ price : int 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : int 1 1 3 1 10 3 45 2 2 1 ...
## $ availability_365 : int 365 355 365 194 0 129 0 220 0 188 ...
# One-hot encode the two factors (no intercept, so every level gets its own
# indicator column) and bind them to the numeric columns, in the order:
# numeric columns, room_type indicators, neighbourhood_group indicators.
airMROneHot <- cbind(
  select_if(airMutiReg, is.numeric),
  as.data.frame(model.matrix(~ room_type - 1, airMutiReg)),
  as.data.frame(model.matrix(~ neighbourhood_group - 1, airMutiReg))
)
# Pairwise correlation matrix of all the resulting columns.
cor(airMROneHot)
## number_of_reviews price minimum_nights
## number_of_reviews 1.000000000 -0.04795423 -0.080116068
## price -0.047954227 1.00000000 0.042799334
## minimum_nights -0.080116068 0.04279933 1.000000000
## availability_365 0.172027581 0.08182883 0.144303063
## room_typeEntire home/apt -0.010087231 0.25585665 0.074899803
## room_typePrivate room 0.017253226 -0.24024642 -0.073836539
## room_typeShared room -0.023354904 -0.05361282 -0.004217946
## neighbourhood_groupBronx 0.009257903 -0.04102998 -0.018185701
## neighbourhood_groupBrooklyn 0.017413636 -0.09860261 -0.039658304
## neighbourhood_groupManhattan -0.045820056 0.16397551 0.067362031
## neighbourhood_groupQueens 0.035966538 -0.08020500 -0.032629026
## neighbourhood_groupStaten Island 0.015088160 -0.01383994 -0.009399622
## availability_365 room_typeEntire home/apt
## number_of_reviews 0.172027581 -0.010087231
## price 0.081828827 0.255856647
## minimum_nights 0.144303063 0.074899803
## availability_365 1.000000000 -0.006804234
## room_typeEntire home/apt -0.006804234 1.000000000
## room_typePrivate room -0.010985839 -0.953470178
## room_typeShared room 0.058293890 -0.162143592
## neighbourhood_groupBronx 0.060806165 -0.052092280
## neighbourhood_groupBrooklyn -0.079670621 -0.073905066
## neighbourhood_groupManhattan -0.005433606 0.160081940
## neighbourhood_groupQueens 0.087112501 -0.108505293
## neighbourhood_groupStaten Island 0.057884814 -0.008390983
## room_typePrivate room room_typeShared room
## number_of_reviews 0.017253226 -0.0233549043
## price -0.240246424 -0.0536128152
## minimum_nights -0.073836539 -0.0042179463
## availability_365 -0.010985839 0.0582938901
## room_typeEntire home/apt -0.953470178 -0.1621435919
## room_typePrivate room 1.000000000 -0.1428987359
## room_typeShared room -0.142898736 1.0000000000
## neighbourhood_groupBronx 0.042765002 0.0310413460
## neighbourhood_groupBrooklyn 0.079464133 -0.0174667259
## neighbourhood_groupManhattan -0.157762374 -0.0091690009
## neighbourhood_groupQueens 0.100676348 0.0266930678
## neighbourhood_groupStaten Island 0.008345062 0.0002329651
## neighbourhood_groupBronx
## number_of_reviews 0.009257903
## price -0.041029979
## minimum_nights -0.018185701
## availability_365 0.060806165
## room_typeEntire home/apt -0.052092280
## room_typePrivate room 0.042765002
## room_typeShared room 0.031041346
## neighbourhood_groupBronx 1.000000000
## neighbourhood_groupBrooklyn -0.126238876
## neighbourhood_groupManhattan -0.134729839
## neighbourhood_groupQueens -0.054692899
## neighbourhood_groupStaten Island -0.013245408
## neighbourhood_groupBrooklyn
## number_of_reviews 0.01741364
## price -0.09860261
## minimum_nights -0.03965830
## availability_365 -0.07967062
## room_typeEntire home/apt -0.07390507
## room_typePrivate room 0.07946413
## room_typeShared room -0.01746673
## neighbourhood_groupBronx -0.12623888
## neighbourhood_groupBrooklyn 1.00000000
## neighbourhood_groupManhattan -0.74524041
## neighbourhood_groupQueens -0.30252659
## neighbourhood_groupStaten Island -0.07326523
## neighbourhood_groupManhattan
## number_of_reviews -0.045820056
## price 0.163975505
## minimum_nights 0.067362031
## availability_365 -0.005433606
## room_typeEntire home/apt 0.160081940
## room_typePrivate room -0.157762374
## room_typeShared room -0.009169001
## neighbourhood_groupBronx -0.134729839
## neighbourhood_groupBrooklyn -0.745240413
## neighbourhood_groupManhattan 1.000000000
## neighbourhood_groupQueens -0.322874856
## neighbourhood_groupStaten Island -0.078193131
## neighbourhood_groupQueens
## number_of_reviews 0.03596654
## price -0.08020500
## minimum_nights -0.03262903
## availability_365 0.08711250
## room_typeEntire home/apt -0.10850529
## room_typePrivate room 0.10067635
## room_typeShared room 0.02669307
## neighbourhood_groupBronx -0.05469290
## neighbourhood_groupBrooklyn -0.30252659
## neighbourhood_groupManhattan -0.32287486
## neighbourhood_groupQueens 1.00000000
## neighbourhood_groupStaten Island -0.03174211
## neighbourhood_groupStaten Island
## number_of_reviews 0.0150881597
## price -0.0138399434
## minimum_nights -0.0093996218
## availability_365 0.0578848136
## room_typeEntire home/apt -0.0083909829
## room_typePrivate room 0.0083450616
## room_typeShared room 0.0002329651
## neighbourhood_groupBronx -0.0132454075
## neighbourhood_groupBrooklyn -0.0732652325
## neighbourhood_groupManhattan -0.0781931311
## neighbourhood_groupQueens -0.0317421076
## neighbourhood_groupStaten Island 1.0000000000
# Plot the correlation matrix of the one-hot encoded data, with variables
# arranged by corrplot's "AOE" ordering.
corrplot(cor(airMROneHot), order = "AOE")
This means that these variables are only weakly correlated with one another.